import re, random, math, locale
from base64 import b64encode, b64decode

class PyMediaWiki(object):

    # Lexer states used when scanning text that may contain tags and
    # HTML comments (mirrors MediaWiki's findColonNoLinks state machine).
    MW_COLON_STATE_TEXT = 0
    MW_COLON_STATE_TAG = 1
    MW_COLON_STATE_TAGSTART = 2
    MW_COLON_STATE_CLOSETAG = 3
    MW_COLON_STATE_TAGSLASH = 4
    MW_COLON_STATE_COMMENT = 5
    MW_COLON_STATE_COMMENTDASH = 6
    MW_COLON_STATE_COMMENTDASHDASH = 7

    # NOTE(review): these registries are mutable class attributes, so they
    # are shared across ALL PyMediaWiki instances — registering a hook on
    # one instance affects every other. Confirm this is intended.
    mTagHooks = {}
    mFunctionHooks = {}
    # Index 0: lowercased synonyms for case-insensitive function hooks;
    # index 1: exact-case synonyms for case-sensitive ones.
    mFunctionSynonyms = [{},{}]

    # Namespaces recognised by replaceInternalLinks: name -> id.
    availableNamespaces = dict([(u'base', 1)])

    def registerTagHook(self, tag, function):
        self.mTagHooks[tag] = function

    def registerFunctionHook(self, name, function, case_insensitive=False):
        self.mFunctionHooks[name] = function
        if case_insensitive:
            self.mFunctionSynonyms[0][name.lower()] = name
        else:
            self.mFunctionSynonyms[1][name] = name

    def registerMagicWord(self):
        """Placeholder: magic-word registration is not implemented.

        Raises:
            NotImplementedError: always.
        """
        # ``assert`` is stripped under ``python -O``; raise so the stub
        # always fails loudly instead of silently doing nothing.
        raise NotImplementedError('Not implemented')

    def registerTemplate(self):
        """Placeholder: template registration is not implemented.

        Raises:
            NotImplementedError: always.
        """
        # ``assert`` is stripped under ``python -O``; raise so the stub
        # always fails loudly instead of silently doing nothing.
        raise NotImplementedError('Not implemented')

    def removeHtmlComments(self, text):
        """Remove ``<!-- ... -->`` comments from *text*.

        A comment that sits on a line of its own (surrounded only by
        spaces and newlines) is collapsed to a single newline; otherwise
        the comment plus the spaces around it are cut out.  An
        unterminated ``<!--`` is left untouched.
        """
        sb = []
        start = text.find(u'<!--')
        last = 0
        while start != -1:
            end = text.find(u'-->', start)
            if end == -1:
                # Unclosed comment: keep everything from here on verbatim.
                break
            end += 3

            # Extend over any run of spaces before and after the comment.
            spaceStart = max(0, start - 1)
            spaceEnd = end
            while spaceStart > 0 and text[spaceStart] == u' ':
                spaceStart -= 1
            # BUG FIX: bound the scan — the original indexed text[spaceEnd]
            # unguarded and raised IndexError for a comment at end of text.
            while spaceEnd < len(text) and text[spaceEnd] == u' ':
                spaceEnd += 1

            leadingNewline = start > 0 and text[spaceStart] == u'\n'
            trailingNewline = spaceEnd < len(text) and text[spaceEnd] == u'\n'
            if leadingNewline and trailingNewline:
                # Comment owns the whole line: replace it with one newline.
                sb.append(text[last:spaceStart])
                sb.append(u'\n')
                last = spaceEnd + 1
            else:
                # BUG FIX: when the comment starts the string there is no
                # preceding character to keep (the original leaked '<').
                if start > 0:
                    sb.append(text[last:spaceStart + 1])
                last = spaceEnd

            start = text.find(u'<!--', end)
        sb.append(text[last:])
        return u''.join(sb)

    # One tag attribute: a name preceded by start-of-string or whitespace,
    # optionally followed by =value, where value may be double-quoted,
    # single-quoted, an unquoted token, or a bare #hex colour.
    _attributePat = re.compile(ur'''(?:^|\s)([A-Za-z0-9]+)(?:\s*=\s*(?:"([^<"]*)"|'([^<']*)'|([a-zA-Z0-9!#$%&()*,\-./:;<>?@[\]^_`{|}~]+)|#([0-9a-fA-F]+)))''', re.UNICODE)
    # Runs of whitespace, used to normalise attribute values.
    _space = re.compile(ur'\s+', re.UNICODE)

    def decodeTagAttributes(self, text):
        """docstring for self.decodeTagAttributes"""
        attribs = {}
        if text.strip() == u'':
            return attribs
        scanner = self._attributePat.scanner(text)
        match = scanner.search()
        while match:
            key, val1, val2, val3, val4 = match.groups()
            value = val1 or val2 or val3 or val4
            if value:
                value = self._space.sub(u' ', value).strip()
            else:
                value = ''
            attribs[key] = self.decodeCharReferences(value)
            
            match = scanner.search()
        return attribs
    
    def setupAttributeWhitelist(self):
        """Build the per-element attribute whitelist (HTML 4 subset).

        Returns a dict mapping element name -> tuple of attribute names
        that validateTagAttributes will allow on that element.
        """
        # Attributes valid on nearly every element.
        common = ( u'id', u'class', u'lang', u'dir', u'title', u'style' )
        block = common + (u'align',)
        # Alignment attributes shared by the table-section elements.
        tablealign = ( u'align', u'char', u'charoff', u'valign' )
        tablecell = ( u'abbr',
                        u'axis',
                        u'headers',
                        u'scope',
                        u'rowspan',
                        u'colspan',
                        u'nowrap', # deprecated
                        u'width',  # deprecated
                        u'height', # deprecated
                        u'bgcolor' # deprecated
                        )
        return {
            u'div':     block,
            u'center':      common, # deprecated
            u'span':    block, # ??
            u'h1':      block,
            u'h2':      block,
            u'h3':      block,
            u'h4':      block,
            u'h5':      block,
            u'h6':      block,
            u'em':      common,
            u'strong':      common,
            u'cite':    common,
            u'code':    common,
            u'var':     common,
            u'img':     common + (u'src', u'alt', u'width', u'height',),
            u'blockquote':  common + (u'cite',),
            u'sub':     common,
            u'sup':     common,
            u'p':       block,
            u'br':      (u'id', u'class', u'title', u'style', u'clear',),
            u'pre':     common + (u'width',),
            u'ins':     common + (u'cite', u'datetime'),
            u'del':     common + (u'cite', u'datetime'),
            u'ul':      common + (u'type',),
            u'ol':      common + (u'type', u'start'),
            u'li':      common + (u'type', u'value'),
            u'dl':      common,
            u'dd':      common,
            u'dt':      common,
            u'table':       common + ( u'summary', u'width', u'border', u'frame',
                                        u'rules', u'cellspacing', u'cellpadding',
                                        u'align', u'bgcolor',
                                ),
            u'caption':     common + (u'align',),
            u'thead':       common + tablealign,
            u'tfoot':       common + tablealign,
            u'tbody':       common + tablealign,
            u'colgroup':    common + ( u'span', u'width' ) + tablealign,
            u'col':     common + ( u'span', u'width' ) + tablealign,
            u'tr':      common + ( u'bgcolor', ) + tablealign,
            u'td':      common + tablecell + tablealign,
            u'th':      common + tablecell + tablealign,
            u'tt':      common,
            u'b':       common,
            u'i':       common,
            u'big':     common,
            u'small':       common,
            u'strike':      common,
            u's':       common,
            u'u':       common,
            u'font':    common + ( u'size', u'color', u'face' ),
            u'hr':      common + ( u'noshade', u'size', u'width' ),
            u'ruby':    common,
            u'rb':      common,
            u'rt':      common, #array_merge( $common, array( 'rbspan' ) ),
            u'rp':      common,
        }
    # NOTE(review): a plain property, so the whole table above is rebuilt
    # on EVERY access to self._whitelist — consider caching if hot.
    _whitelist = property(setupAttributeWhitelist)

    def validateTagAttributes(self, attribs, element):
        """docstring for self.validateTagAttributes"""
        out = {}
        if element not in self._whitelist:
            return out
        whitelist = self._whitelist[element]
        for attribute in attribs:
            value = attribs[attribute]
            if attribute not in whitelist:
                continue
            # Strip javascript "expression" from stylesheets.
            # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
            if attribute == u'style':
                value = self.checkCss(value)
                if value == False:
                    continue
            elif attribute == u'id':
                value = self.escapeId(value)
            # If this attribute was previously set, override it.
            # Output should only have one attribute of each name.
            out[attribute] = value
        return out

    def safeEncodeAttribute(self, encValue):
        """Encode *encValue* for safe embedding in a double-quoted HTML
        attribute, also defusing wiki markup that is live inside attributes.
        """
        # Order matters: '&' must be escaped before any rule that emits
        # entities of its own.
        substitutions = (
            (u'&', u'&amp;'),
            (u'<', u'&lt;'),
            (u'>', u'&gt;'),
            (u'"', u'&quot;'),
            # Template / link openers.
            (u'{', u'&#123;'),
            (u'[', u'&#91;'),
            # Bold/italic markers and magic links.
            (u"''", u'&#39;&#39;'),
            (u'ISBN', u'&#73;SBN'),
            (u'RFC', u'&#82;FC'),
            (u'PMID', u'&#80;MID'),
            # Table-cell separator and behaviour switches.
            (u'|', u'&#124;'),
            (u'__', u'&#95;_'),
            # Literal whitespace controls.
            (u'\n', u'&#10;'),
            (u'\r', u'&#13;'),
            (u'\t', u'&#9;'),
        )
        for needle, replacement in substitutions:
            encValue = encValue.replace(needle, replacement)
        return encValue

    def fixTagAttributes(self, text, element):
        if text.strip() == u'':
            return u''
        
        stripped = self.validateTagAttributes(self.decodeTagAttributes(text), element)
        
        sb = []
        
        for attribute in stripped:
            value = stripped[attribute]
            encAttribute = attribute.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
            encValue = self.safeEncodeAttribute(value)
            
            sb.append(u' ')
            sb.append(encAttribute)
            sb.append(u'="')
            sb.append(encValue)
            sb.append(u'"')
        
        return u''.join(sb)

    # One candidate tag (text after a '<'): optional '/', the tag name,
    # its attribute string, '>' or '/>', then trailing text up to the
    # next '<'.
    _tagPattern = re.compile(ur'^(/?)(\w+)([^>]*?)(/?>)([^<]*)$', re.UNICODE)

    _htmlpairs = ( # Tags that must be closed
        u'b', u'del', u'i', u'ins', u'u', u'font', u'big', u'small', u'sub', u'sup', u'h1',
        u'h2', u'h3', u'h4', u'h5', u'h6', u'cite', u'code', u'em', u's',
        u'strike', u'strong', u'tt', u'var', u'div', u'center',
        u'blockquote', u'ol', u'ul', u'dl', u'table', u'caption', u'pre',
        # NOTE(review): u'u' appears twice in this tuple (also on the first
        # line); harmless for membership tests but looks unintentional.
        u'ruby', u'rt' , u'rb' , u'rp', u'p', u'span', u'u'
    )
    _htmlsingle = ( # Tags that may stand alone (no close tag required)
        u'br', u'hr', u'li', u'dt', u'dd', u'img',
    )
    _htmlsingleonly = ( # Elements that cannot have close tags
        u'br', u'hr', u'img',
    )
    _htmlnest = ( # Tags that can be nested--??
        u'table', u'tr', u'td', u'th', u'div', u'blockquote', u'ol', u'ul',
        u'dl', u'font', u'big', u'small', u'sub', u'sup', u'span', u'img',
    )
    _tabletags = ( # Can only appear inside table
        u'td', u'th', u'tr',
    )
    _htmllist = ( # Tags used by list
        u'ul', u'ol',
    )
    _listtags = ( # Tags that can appear in a list
        u'li',
    )
    # Derived sets used by removeHtmlTags.
    _htmlsingleallowed = _htmlsingle + _tabletags 
    _htmlelements = _htmlsingle + _htmlpairs + _htmlnest

    def removeHtmlTags(self, text):
        """convert bad tags into HTML identities"""
        sb = []
        text = self.removeHtmlComments(text)
        bits = text.split(u'<')
        sb.append(bits.pop(0))
        tagstack = []
        tablestack = tagstack
        for x in bits:
            m = self._tagPattern.match(x)
            if not m:
                continue
            slash, t, params, brace, rest = m.groups()
            t = t.lower()
            badtag = False
            if t in self._htmlelements:
                # Check our stack
                if slash:
                    # Closing a tag...
                    if t in self._htmlsingleonly or len(tagstack) == 0:
                        badtag = True
                    else:
                        ot = tagstack.pop()
                        if ot != t:
                            if ot in self._htmlsingleallowed:
                                # Pop all elements with an optional close tag
                                # and see if we find a match below them
                                optstack = []
                                optstack.append(ot)
                                while True:
                                    if len(tagstack) == 0:
                                        break
                                    ot = tagstack.pop()
                                    if ot == t or ot not in self._htmlsingleallowed:
                                        break
                                    optstack.append(ot)
                                if t != ot:
                                    # No match. Push the optinal elements back again
                                    badtag = True
                                    tagstack += reversed(optstack)
                            else:
                                tagstack.append(ot)
                                # <li> can be nested in <ul> or <ol>, skip those cases:
                                if ot not in self._htmllist and t in self._listtags:
                                    badtag = True
                        elif t == u'table':
                            if len(tablestack) == 0:
                                bagtag = True
                            else:
                                tagstack = tablestack.pop()
                    newparams = u''
                else:
                    # Keep track for later
                    if t in self._tabletags and u'table' not in tagstack:
                        badtag = True
                    elif t in tagstack and t not in self._htmlnest:
                        badtag = True
                    # Is it a self-closed htmlpair? (bug 5487)
                    elif brace == u'/>' and t in self._htmlpairs:
                        badTag = True
                    elif t in self._htmlsingleonly:
                        # Hack to force empty tag for uncloseable elements
                        brace = u'/>'
                    elif t in self._htmlsingle:
                        # Hack to not close $htmlsingle tags
                        brace = None
                    else:
                        if t == u'table':
                            tablestack.append(tagstack)
                            tagstack = []
                        tagstack.append(t)
                    newparams = self.fixTagAttributes(params, t)
                if not badtag:
                    rest = rest.replace(u'>', u'&gt;')
                    if brace == u'/>':
                        close = u' /'
                    else:
                        close = u''
                    sb.append(u'<')
                    sb.append(slash)
                    sb.append(t)
                    sb.append(newparams)
                    sb.append(close)
                    sb.append(u'>')
                    sb.append(rest)
                    continue
            sb.append(u'&lt;')
            sb.append(x.replace(u'>', u'&gt;'))
        
        # Close off any remaining tags
        while tagstack:
            t = tagstack.pop()
            sb.append(u'</')
            sb.append(t)
            sb.append(u'>\n')
            if t == u'table':
                if not tablestack:
                    break
                tagstack = tablestack.pop()
        
        return u''.join(sb)

    # HTML 4.01 named character entities: entity name -> Unicode codepoint.
    # Used by _normalizeCallback / _decodeCallback to recognise and decode
    # named references.  Names are case-sensitive, per the HTML 4 spec.
    _htmlEntities = {
        u'Aacute':   193,
        u'aacute':   225,
        u'Acirc':     194,
        u'acirc':     226,
        u'acute':     180,
        u'AElig':     198,
        u'aelig':     230,
        u'Agrave':   192,
        u'agrave':   224,
        u'alefsym': 8501,
        u'Alpha':     913,
        u'alpha':     945,
        u'amp':     38,
        u'and':     8743,
        u'ang':     8736,
        u'Aring':     197,
        u'aring':     229,
        u'asymp':     8776,
        u'Atilde':   195,
        u'atilde':   227,
        u'Auml':       196,
        u'auml':       228,
        u'bdquo':     8222,
        u'Beta':       914,
        u'beta':       946,
        u'brvbar':   166,
        u'bull':       8226,
        u'cap':     8745,
        u'Ccedil':   199,
        u'ccedil':   231,
        u'cedil':     184,
        u'cent':       162,
        u'Chi':     935,
        u'chi':     967,
        u'circ':       710,
        u'clubs':     9827,
        u'cong':       8773,
        u'copy':       169,
        u'crarr':     8629,
        u'cup':     8746,
        u'curren':   164,
        u'dagger':   8224,
        u'Dagger':   8225,
        u'darr':       8595,
        u'dArr':       8659,
        u'deg':     176,
        u'Delta':     916,
        u'delta':     948,
        u'diams':     9830,
        u'divide':   247,
        u'Eacute':   201,
        u'eacute':   233,
        u'Ecirc':     202,
        u'ecirc':     234,
        u'Egrave':   200,
        u'egrave':   232,
        u'empty':     8709,
        u'emsp':       8195,
        u'ensp':       8194,
        u'Epsilon': 917,
        u'epsilon': 949,
        u'equiv':     8801,
        u'Eta':     919,
        u'eta':     951,
        u'ETH':     208,
        u'eth':     240,
        u'Euml':       203,
        u'euml':       235,
        u'euro':       8364,
        u'exist':     8707,
        u'fnof':       402,
        u'forall':   8704,
        u'frac12':   189,
        u'frac14':   188,
        u'frac34':   190,
        u'frasl':     8260,
        u'Gamma':     915,
        u'gamma':     947,
        u'ge':       8805,
        u'gt':       62,
        u'harr':       8596,
        u'hArr':       8660,
        u'hearts':   9829,
        u'hellip':   8230,
        u'Iacute':   205,
        u'iacute':   237,
        u'Icirc':     206,
        u'icirc':     238,
        u'iexcl':     161,
        u'Igrave':   204,
        u'igrave':   236,
        u'image':     8465,
        u'infin':     8734,
        u'int':     8747,
        u'Iota':       921,
        u'iota':       953,
        u'iquest':   191,
        u'isin':       8712,
        u'Iuml':       207,
        u'iuml':       239,
        u'Kappa':     922,
        u'kappa':     954,
        u'Lambda':   923,
        u'lambda':   955,
        u'lang':       9001,
        u'laquo':     171,
        u'larr':       8592,
        u'lArr':       8656,
        u'lceil':     8968,
        u'ldquo':     8220,
        u'le':       8804,
        u'lfloor':   8970,
        u'lowast':   8727,
        u'loz':     9674,
        u'lrm':     8206,
        u'lsaquo':   8249,
        u'lsquo':     8216,
        u'lt':       60,
        u'macr':       175,
        u'mdash':     8212,
        u'micro':     181,
        u'middot':   183,
        u'minus':     8722,
        u'Mu':       924,
        u'mu':       956,
        u'nabla':     8711,
        u'nbsp':       160,
        u'ndash':     8211,
        u'ne':       8800,
        u'ni':       8715,
        u'not':     172,
        u'notin':     8713,
        u'nsub':       8836,
        u'Ntilde':   209,
        u'ntilde':   241,
        u'Nu':       925,
        u'nu':       957,
        u'Oacute':   211,
        u'oacute':   243,
        u'Ocirc':     212,
        u'ocirc':     244,
        u'OElig':     338,
        u'oelig':     339,
        u'Ograve':   210,
        u'ograve':   242,
        u'oline':     8254,
        u'Omega':     937,
        u'omega':     969,
        u'Omicron': 927,
        u'omicron': 959,
        u'oplus':     8853,
        u'or':       8744,
        u'ordf':       170,
        u'ordm':       186,
        u'Oslash':   216,
        u'oslash':   248,
        u'Otilde':   213,
        u'otilde':   245,
        u'otimes':   8855,
        u'Ouml':       214,
        u'ouml':       246,
        u'para':       182,
        u'part':       8706,
        u'permil':   8240,
        u'perp':       8869,
        u'Phi':     934,
        u'phi':     966,
        u'Pi':       928,
        u'pi':       960,
        u'piv':     982,
        u'plusmn':   177,
        u'pound':     163,
        u'prime':     8242,
        u'Prime':     8243,
        u'prod':       8719,
        u'prop':       8733,
        u'Psi':     936,
        u'psi':     968,
        u'quot':       34,
        u'radic':     8730,
        u'rang':       9002,
        u'raquo':     187,
        u'rarr':       8594,
        u'rArr':       8658,
        u'rceil':     8969,
        u'rdquo':     8221,
        u'real':       8476,
        u'reg':     174,
        u'rfloor':   8971,
        u'Rho':     929,
        u'rho':     961,
        u'rlm':     8207,
        u'rsaquo':   8250,
        u'rsquo':     8217,
        u'sbquo':     8218,
        u'Scaron':   352,
        u'scaron':   353,
        u'sdot':       8901,
        u'sect':       167,
        u'shy':     173,
        u'Sigma':     931,
        u'sigma':     963,
        u'sigmaf':   962,
        u'sim':     8764,
        u'spades':   9824,
        u'sub':     8834,
        u'sube':       8838,
        u'sum':     8721,
        u'sup':     8835,
        u'sup1':       185,
        u'sup2':       178,
        u'sup3':       179,
        u'supe':       8839,
        u'szlig':     223,
        u'Tau':     932,
        u'tau':     964,
        u'there4':   8756,
        u'Theta':     920,
        u'theta':     952,
        u'thetasym':   977,
        u'thinsp':   8201,
        u'THORN':     222,
        u'thorn':     254,
        u'tilde':     732,
        u'times':     215,
        u'trade':     8482,
        u'Uacute':   218,
        u'uacute':   250,
        u'uarr':       8593,
        u'uArr':       8657,
        u'Ucirc':     219,
        u'ucirc':     251,
        u'Ugrave':   217,
        u'ugrave':   249,
        u'uml':     168,
        u'upsih':     978,
        u'Upsilon': 933,
        u'upsilon': 965,
        u'Uuml':       220,
        u'uuml':       252,
        u'weierp':   8472,
        u'Xi':       926,
        u'xi':       958,
        u'Yacute':   221,
        u'yacute':   253,
        u'yen':     165,
        u'Yuml':       376,
        u'yuml':       255,
        u'Zeta':       918,
        u'zeta':       950,
        u'zwj':     8205,
        u'zwnj':       8204
    }

    _charRefsPat = re.compile(ur'''(&([A-Za-z0-9]+);|&#([0-9]+);|&#[xX]([0-9A-Za-z]+);|(&))''', re.UNICODE)

    def validateCodepoint(self, codepoint):
        """Return True if *codepoint* is a character code legal in XML."""
        # Tab, LF and CR are the only controls allowed.
        if codepoint in (0x09, 0x0a, 0x0d):
            return True
        # Everything else must avoid the surrogate range and the BMP
        # non-characters.
        return (0x20 <= codepoint <= 0xd7ff
                or 0xe000 <= codepoint <= 0xfffd
                or 0x10000 <= codepoint <= 0x10ffff)

    def _normalizeCallback(self, match):
        text, norm, dec, hexval, _ = match.groups()
        if norm:
            sb = []
            sb.append(u'&')
            if norm not in self._htmlEntities:
                sb.append(u'amp;')
            sb.append(norm)
            sb.append(u';')
            return u''.join(sb)
        elif dec:
            dec = int(dec)
            if self.validateCodepoint(dec):
                sb = []
                sb.append(u'&#')
                sb.append(dec)
                sb.append(u';')
                return u''.join(sb)
        elif hexval:
            hexval = int(hexval, 16)
            if self.validateCodepoint(hexval):
                sb = []
                sb.append(u'&#x')
                sb.append(hex(hexval))
                sb.append(u';')
                return u''.join(sb)
        return text.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')

    def normalizeCharReferences(self, text):
        """docstring for self.normalizeCharReferences"""
        return self._charRefsPat.sub(self._normalizeCallback, text)

    def _decodeCallback(self, match):
        text, norm, dec, hexval, _ = match.groups()
        if norm:
            if norm in self._htmlEntities:
                return unichr(self._htmlEntities[norm])
            else:
                sb = []
                sb.append(u'&')
                sb.append(norm)
                sb.append(u';')
                return u''.join(sb)
        elif dec:
            dec = int(dec)
            if self.validateCodepoint(dec):
                return unichr(dec)
            return u'?'
        elif hexval:
            hexval = int(hexval, 16)
            if self.validateCodepoint(dec):
                return unichr(dec)
            return u'?'
        return text

    def decodeCharReferences(self, text):
        """docstring for self.decodeCharReferences"""
        if text:
            return self._charRefsPat.sub(self._decodeCallback, text)
        return ''

    # NOTE(review): matches '*...*' (no slashes), so it strips the interior
    # of a CSS '/* ... */' comment but leaves the '/' characters — confirm
    # this matches the intended MediaWiki behaviour.
    _cssCommentPat = re.compile(ur'''\*.*?\*''', re.UNICODE)
    # CSS hex escape: backslash, 1-6 hex digits, optional trailing space.
    _toUTFPat = re.compile(ur'''\\([0-9A-Fa-f]{1,6})[\s]?''', re.UNICODE)
    # Hostile CSS constructs: IE 'expression', protocol fragments, 'url('.
    _hackPat = re.compile(ur'''(expression|tps*://|url\s*\().*''', re.UNICODE | re.IGNORECASE)

    def _convertToUtf8(self, s):
        return unichr(int(s.group(1), 16))

    def checkCss(self, value):
        """docstring for self.checkCss"""
        stripped = self.decodeCharReferences(value)
        
        stripped = self._cssCommentPat.sub(u'', stripped)
        value = stripped
        
        stripped = self._toUTFPat.sub(self._convertToUtf8, stripped)
        stripped.replace(u'\\', u'')
        if self._hackPat.search(stripped):
            # someone is haxx0ring
            return False
        
        return value

    def escapeId(self, value):
        """docstring for self.escapeId"""
        # TODO
        return self.slugifyBit(value)

    _hrPat = re.compile(u'''^-----*''', re.UNICODE | re.MULTILINE)
    def parseHorizontalRule(self, text):
        return self._hrPat.sub(ur'<hr />', text)

    _h1Pat = re.compile(u'^=(.+)=\s*$', re.UNICODE | re.MULTILINE)
    _h2Pat = re.compile(u'^==(.+)==\s*$', re.UNICODE | re.MULTILINE)
    _h3Pat = re.compile(u'^===(.+)===\s*$', re.UNICODE | re.MULTILINE)
    _h4Pat = re.compile(u'^====(.+)====\s*$', re.UNICODE | re.MULTILINE)
    _h5Pat = re.compile(u'^=====(.+)=====\s*$', re.UNICODE | re.MULTILINE)
    _h6Pat = re.compile(u'^======(.+)======\s*$', re.UNICODE | re.MULTILINE)
    def parseHeaders(self, text):
        text = self._h6Pat.sub(ur'<h6>\1</h6>', text)
        text = self._h5Pat.sub(ur'<h5>\1</h5>', text)
        text = self._h4Pat.sub(ur'<h4>\1</h4>', text)
        text = self._h3Pat.sub(ur'<h3>\1</h3>', text)
        text = self._h2Pat.sub(ur'<h2>\1</h2>', text)
        text = self._h1Pat.sub(ur'<h1>\1</h1>', text)
        return text

    # Runs of two or more apostrophes (wiki bold/italic markers).
    _quotePat = re.compile(u"""(''+)""", re.UNICODE)
    def parseQuotes(self, text):
        """Convert wiki ''italic'' / '''bold''' apostrophe markup in one
        line of text to <i>/<b> HTML, following MediaWiki's doQuotes
        balancing rules.

        After splitting on apostrophe runs, odd indices of ``arr`` hold
        markers and even indices hold plain text.
        """
        arr = self._quotePat.split(text)
        if len(arr) == 1:
            return text
        # First, do some preliminary work. This may shift some apostrophes from
        # being mark-up to being text. It also counts the number of occurrences
        # of bold and italics mark-ups.
        numBold = 0
        numItalics = 0
        for i,r in zip(range(len(arr)), arr):
            if i%2 == 1:
                # 4 apostrophes: treat as text-apostrophe + bold marker;
                # >5: excess apostrophes become literal text.
                l = len(r)
                if l == 4:
                    arr[i-1] += u"'"
                    arr[i] = u"'''"
                elif l > 5:
                    arr[i-1] += u"'" * (len(arr[i]) - 5)
                    arr[i] = u"'''''"
                if l == 2:
                    numItalics += 1
                elif l >= 5:
                    numItalics += 1
                    numBold += 1
                else:
                    numBold += 1
        
        # If there is an odd number of both bold and italics, it is likely
        # that one of the bold ones was meant to be an apostrophe followed
        # by italics. Which one we cannot know for certain, but it is more
        # likely to be one that has a single-letter word before it.
        if numBold%2 == 1 and numItalics%2 == 1:
            firstSingleLetterWord = -1
            firstMultiLetterWord = -1
            firstSpace = -1
            for i,r in zip(range(len(arr)), arr):
                if i%2 == 1 and len(r) == 3:
                    # Classify the bold marker by the two characters that
                    # precede it: space, single-letter word, or longer word.
                    x1 = arr[i-1][-1:]
                    x2 = arr[i-1][-2:-1]
                    if x1 == u' ':
                        if firstSpace == -1:
                            firstSpace = i
                    elif x2 == u' ':
                        if firstSingleLetterWord == -1:
                            firstSingleLetterWord = i
                    else:
                        if firstMultiLetterWord == -1:
                            firstMultiLetterWord = i
            
            # If there is a single-letter word, use it!
            if firstSingleLetterWord > -1:
                arr[firstSingleLetterWord] = u"''"
                arr[firstSingleLetterWord-1] += u"'"
            # If not, but there's a multi-letter word, use that one.
            elif firstMultiLetterWord > -1:
                arr[firstMultiLetterWord] = u"''"
                arr[firstMultiLetterWord-1] += u"'"
            # ... otherwise use the first one that has neither.
            # (notice that it is possible for all three to be -1 if, for example,
            # there is only one pentuple-apostrophe in the line)
            elif firstSpace > -1:
                arr[firstSpace] = u"''"
                arr[firstSpace-1] += u"'"
        
        # Now let's actually convert our apostrophic mush to HTML!
        # 'state' tracks the currently-open tags: '', 'i', 'b', 'ib', 'bi',
        # or 'both' ('''''...''''' whose resolution is deferred; the text
        # seen meanwhile accumulates in 'buffer').
        output = []
        buffer = None
        state = ''
        for i,r in zip(range(len(arr)), arr):
            if i%2 == 0:
                # Plain text segment.
                if state == 'both':
                    buffer.append(r)
                else:
                    output.append(r)
            else:
                # Marker segment: 2 = italic toggle, 3 = bold toggle,
                # 5 = both at once.
                if len(r) == 2:
                    if state == 'i':
                        output.append(u"</i>")
                        state = ''
                    elif state == 'bi':
                        output.append(u"</i>")
                        state = 'b'
                    elif state == 'ib':
                        # Close out of order, then reopen <b>.
                        output.append(u"</b></i><b>")
                        state = 'b'
                    elif state == 'both':
                        output.append(u"<b><i>")
                        output.append(u''.join(buffer))
                        buffer = None
                        output.append(u"</i>")
                        state = 'b'
                    elif state == 'b':
                        output.append(u"<i>")
                        state = 'bi'
                    else: # ''
                        output.append(u"<i>")
                        state = 'i'
                elif len(r) == 3:
                    if state == 'b':
                        output.append(u"</b>")
                        state = ''
                    elif state == 'bi':
                        # Close out of order, then reopen <i>.
                        output.append(u"</i></b><i>")
                        state = 'i'
                    elif state == 'ib':
                        output.append(u"</b>")
                        state = 'i'
                    elif state == 'both':
                        output.append(u"<i><b>")
                        output.append(u''.join(buffer))
                        buffer = None
                        output.append(u"</b>")
                        state = 'i'
                    elif state == 'i':
                        output.append(u"<b>")
                        state = 'ib'
                    else: # ''
                        output.append(u"<b>")
                        state = 'b'
                elif len(r) == 5:
                    if state == 'b':
                        output.append(u"</b><i>")
                        state = 'i'
                    elif state == 'i':
                        output.append(u"</i><b>")
                        state = 'b'
                    elif state == 'bi':
                        output.append(u"</i></b>")
                        state = ''
                    elif state == 'ib':
                        output.append(u"</b></i>")
                        state = ''
                    elif state == 'both':
                        output.append(u"<i><b>")
                        output.append(u''.join(buffer))
                        buffer = None
                        output.append(u"</b></i>")
                        state = ''
                    else: # ''
                        # Defer: we don't yet know whether <i><b> or <b><i>.
                        buffer = []
                        state = 'both'
        
        # Close any tags still open at end of line.
        if state == 'both':
            output.append(u"<i><b>")
            output.append(u''.join(buffer))
            buffer = None
            output.append(u"</b></i>")
        elif state != '':
            if state == 'b' or state == 'ib':
                output.append(u"</b>")
            if state == 'i' or state == 'bi' or state == 'ib':
                output.append(u"</i>")
            if state == 'bi':
                output.append(u"</b>")
        return u''.join(output)

    def parseAllQuotes(self, text):
        sb = []
        lines = text.split(u'\n')
        first = True
        for line in lines:
            if not first:
                sb.append(u'\n')
            else:
                first = False
            sb.append(self.parseQuotes(line))
        return u''.join(sb)

    # English stop words; NOTE(review): only referenced by a commented-out
    # line in slugifyBit, so this pass is currently disabled.
    _removePat = re.compile(ur'\b(' + ur'|'.join((u"a", u"an", u"as", u"at", u"before", u"but", u"by", u"for", u"from",
                                u"is", u"in", u"into", u"like", u"of", u"off", u"on", u"onto", u"per",
                                u"since", u"than", u"the", u"this", u"that", u"to", u"up", u"via",
                                u"with")) + ur')\b', re.UNICODE | re.IGNORECASE)
    # Characters dropped outright from slugs.
    _nonWordSpaceDashPat = re.compile(ur'[^\w\s\-\./]', re.UNICODE)
    # Runs of separators collapsed to a single space.
    _multiSpacePat = re.compile(ur'[\s\-_\./]+', re.UNICODE)
    # Remaining spaces become dashes.
    _spacePat = re.compile(ur' ', re.UNICODE)

    def slugifyBit(self, bit):
        """Normalise one path component into a lowercase, dash-separated slug."""
        # Drop disallowed characters, collapse separator runs to single
        # spaces, trim, then turn the spaces into dashes.
        cleaned = self._nonWordSpaceDashPat.sub(u'', bit)
        cleaned = self._multiSpacePat.sub(u' ', cleaned).strip()
        return self._spacePat.sub(u'-', cleaned).lower()

    def slugify(self, text):
        """Slugify each '/'-separated component of *text*, keeping the slashes."""
        text = self.to_unicode(text)
        parts = [self.slugifyBit(part) for part in text.split(u'/')]
        return u'/'.join(parts)

    # Internal link body after '[[': optional 'namespace:', the link target,
    # an optional '|alt text', the closing ']]', then trailing text.
    _linkPat = re.compile(ur'^(?:([A-Za-z0-9]+):)?([A-Za-z0-9_\.\-\s\/:]+)(?:\|([^\n]+?))?\]\](.*)$', re.UNICODE | re.DOTALL)
    def replaceInternalLinks(self, text):
        """Convert [[namespace:link|alt]] wiki links into HTML anchors.

        Unknown namespaces fall back to u'wiki'; any '[['-piece that does not
        parse as a link is emitted verbatim with its delimiter restored.
        """
        arr = text.split('[[')
        sb = []
        # Text before the first '[[' passes through untouched.
        sb.append(arr.pop(0))
        special_ns = [u'wiki', u'special']
        for bit in arr:
            namespace, link, alt, rest = None, None, None, None
            match = self._linkPat.match(bit)
            if match:
                namespace, link, alt, rest = match.groups()
            if link:
                if not namespace:
                    namespace = u'wiki'
                elif namespace not in special_ns:
                    # BUG FIX: was availableNamespaces.has_key(namespace);
                    # has_key is deprecated (and gone in Python 3).
                    if namespace not in self.availableNamespaces:
                        namespace = u'wiki'
                namespace = self.slugify(namespace)
                sb.append(u'<a href="')
                sb.append(u'/wiki/')
                if namespace in special_ns and namespace != u'wiki':
                    sb.append(str(namespace))
                    sb.append(u'/')
                sb.append(self.slugify(link))
                if alt:
                    # The alt text (after '|') is what the reader sees.
                    link = alt
                sb.append(u'/">')
                sb.append(link)
                sb.append(u'</a>')

                sb.append(rest)
            else:
                # Not a valid link: restore the delimiter and emit as-is.
                sb.append(u'[[')
                sb.append(bit)
        return u''.join(sb)


    def checkTOC(self, text):
        """Handle the __NOTOC__ / __TOC__ magic words.

        Returns (text, showToc): __NOTOC__ is removed and disables the TOC;
        __TOC__ is replaced by a <!--MWTOC--> placeholder and forces it on
        (taking precedence when both appear).
        """
        show = True
        if u"__NOTOC__" in text:
            text = text.replace(u"__NOTOC__", u"")
            show = False
        if u"__TOC__" in text:
            text = text.replace(u"__TOC__", u"<!--MWTOC-->")
            show = True
        return text, show

    # Bracketed external link: [url optional-label]. The URL must start with a
    # known protocol (or be site-relative '/') and may not contain brackets,
    # angle brackets, or control characters.
    _bracketedLinkPat = re.compile(ur'(?:\[((?:irc://|https?://|ftp://|/)[^<>\]\[' + u"\x00-\x20\x7f" + ur']*)\s*(.*?)\])', re.UNICODE)
    def replaceExternalLinks(self, text):
        """Turn bracketed external links ([url label]) into HTML anchors."""
        # _bracketedLinkPat has two capture groups, so split() yields chunks
        # of three: plain text, the URL, then the (possibly empty) label.
        pieces = self._bracketedLinkPat.split(text)
        out = []
        total = len(pieces)
        idx = 0
        num_links = 0
        while idx < total:
            if idx % 3 == 0:
                # Plain text between links.
                #out.append(self.replaceFreeExternalLinks(pieces[idx]))
                out.append(pieces[idx])
                idx += 1
            else:
                url = pieces[idx]
                label = pieces[idx + 1]
                out.append(u'<a href="')
                out.append(url)
                out.append(u'">')
                if label:
                    out.append(label)
                else:
                    # No label given: show a shortened form of the URL itself.
                    num_links += 1
                    out.append(self.to_unicode(self.truncate_url(url)))
                out.append(u'</a>')
                idx += 2
        return ''.join(out)

    # Captures a bare URL protocol; used to split free links out of text.
    _protocolPat = re.compile(ur'(\b(?:irc://|https?://|ftp://))', re.UNICODE)
    # After a protocol: the URL body (no brackets/angle/control chars) plus
    # whatever trails it.
    _specialUrlPat = re.compile(ur'^([^<>\]\[' + u"\x00-\x20\x7f" + ur']+)(.*)$', re.UNICODE)
    # Matches a string that is exactly one of the supported protocols.
    _protocolsPat = re.compile(ur'^(irc://|https?://|ftp://)$', re.UNICODE)

    def replaceFreeExternalLinks(self, text):
        """Linkify bare URLs (http://..., ftp://..., irc://...) in *text*.

        Trailing punctuation and escaped angle brackets are excluded from the
        URL and re-emitted as plain text after the generated anchor.
        """
        bits = self._protocolPat.split(text)
        sb = [bits.pop(0)]
        # After the pop, bits alternates [protocol, remainder, protocol, ...].
        i = 0
        l = len(bits)
        while i < l:
            protocol = bits[i]
            remainder = bits[i+1]
            i += 2
            match = self._specialUrlPat.match(remainder)
            if match:
                # Found some characters after the protocol that look promising
                url = protocol + match.group(1)
                trail = match.group(2)

                # special case: handle urls as url args:
                # http://www.example.com/foo?=http://www.example.com/bar
                # BUG FIX: the nested URL lives in bits[i+1] (the *next*
                # remainder); the old code re-matched `remainder` itself,
                # duplicating the already-consumed text.
                if len(trail) == 0 and len(bits) > i and self._protocolsPat.match(bits[i]):
                    match = self._specialUrlPat.match(bits[i+1])
                    if match:
                        url += bits[i] + match.group(1)
                        i += 2
                        trail = match.group(2)

                # The characters '<' and '>' (which were escaped by
                # removeHTMLtags()) should not be included in
                # URLs, per RFC 2396.
                pos = max(url.find('&lt;'), url.find('&gt;'))
                if pos != -1:
                    trail = url[pos:] + trail
                    url = url[0:pos]

                # Strip trailing punctuation from the URL; an unbalanced ')'
                # only counts as punctuation when the URL has no '('.
                sep = ',;.:!?'
                if '(' not in url:
                    sep += ')'

                # BUG FIX: this trim loop used to reuse `i`, clobbering the
                # outer loop's index into `bits` and skipping later links.
                end = len(url) - 1
                while end >= 0:
                    if url[end] not in sep:
                        break
                    end -= 1
                end += 1

                if end != len(url):
                    trail = url[end:] + trail
                    url = url[0:end]

                url = self.cleanURL(url)

                sb.append(u'<a href="')
                sb.append(url)
                sb.append(u'">')
                sb.append(self.truncate_url(url))
                sb.append(u'</a>')
                sb.append(trail)
            else:
                # Nothing URL-ish followed the protocol: emit it verbatim.
                sb.append(protocol)
                sb.append(remainder)
        return ''.join(sb)

    def urlencode(self, char):
        """Percent-encode a single character; space becomes '+'."""
        code = ord(char)
        return '+' if code == 32 else "%%%02x" % code

    _controlCharsPat = re.compile(ur'[\]\[<>"' + u"\\x00-\\x20\\x7F" + ur']]', re.UNICODE)
    _hostnamePat = re.compile(ur'^([^:]+:)(//[^/]+)?(.*)$', re.UNICODE)
    _stripPat = re.compile(u'\\s|\u00ad|\u1806|\u200b|\u2060|\ufeff|\u03f4|\u034f|\u180b|\u180c|\u180d|\u200c|\u200d|[\ufe00-\ufe0f]', re.UNICODE)
    def cleanURL(self, url):
        """Normalize entities and sanitise *url* for use as a link href."""
        # Normalize any HTML entities in input. They will be
        # re-escaped by makeExternalLink().
        url = self.decodeCharReferences(url)

        # Escape any control characters introduced by the above step.
        # BUG FIX: re.sub hands the callback a Match object, so unwrap it to
        # the matched character before delegating to urlencode().
        url = self._controlCharsPat.sub(lambda m: self.urlencode(m.group(0)), url)

        # Validate hostname portion
        match = self._hostnamePat.match(url)
        if match:
            protocol, host, rest = match.groups()
            # The '//host' group is optional (e.g. mailto:); guard None.
            host = host or u''

            # Characters that will be ignored in IDNs.
            # http://tools.ietf.org/html/3454#section-3.1
            # Strip them before further processing so blacklists and such work.
            # BUG FIX: the substitution result was previously discarded.
            host = self._stripPat.sub('', host)

            # @fixme: validate hostnames here

            return protocol + host + rest
        else:
            return url

    # Table start line: optional leading ':'s (definition-list indentation)
    # followed by '{|' and the table's attribute string.
    _zomgPat = re.compile(ur'^(:*)\{\|(.*)$', re.UNICODE)

    def doTableStuff(self, text, state):
        """Translate wiki table markup ({| ... |}) into HTML tables.

        Works line by line, keeping parallel stacks so nested tables work:
        td/tr record whether a cell/row is currently open, ltd remembers the
        last cell tag name (td/th/caption), ltr holds pending row attributes,
        and has_opened_tr records whether the current table emitted any <tr>.
        """
        t = text.split(u"\n")
        td = [] # Is currently a td tag open?
        ltd = [] # Was it TD or TH?
        tr = [] # Is currently a tr tag open?
        ltr = [] # tr attributes
        has_opened_tr = [] # Did this table open a <tr> element?
        indent_level = 0 # indent level of the table

        for k, x in enumerate(t):
            x = x.strip()
            fc = x[0:1]
            matches = self._zomgPat.match(x)
            if matches:
                # Table start: leading ':'s become <dl><dd> indentation.
                indent_level = len(matches.group(1))

                attributes = self.unstripForHTML(matches.group(2), state)

                t[k] = u'<dl><dd>'*indent_level + u'<table' + self.fixTagAttributes(attributes, u'table') + u'>'
                td.append(False)
                ltd.append(u'')
                tr.append(False)
                ltr.append(u'')
                has_opened_tr.append(False)
            elif len(td) == 0:
                # Outside any table: leave the line untouched.
                pass
            elif u'|}' == x[0:2]:
                # Table end: close any open cell/row, then the table itself.
                z = u"</table>" + x[2:]
                l = ltd.pop()
                if not has_opened_tr.pop():
                    # BUG FIX: the second tag here was an unclosed '<tr>'.
                    z = u"<tr><td></td></tr>" + z
                if tr.pop():
                    z = u"</tr>" + z
                if td.pop():
                    z = u'</' + l + u'>' + z
                ltr.pop()
                t[k] = z + u'</dd></dl>'*indent_level
            elif u'|-' == x[0:2]: # Allows for |-------------
                x = x[1:]
                while x != u'' and x[0:1] == '-':
                    x = x[1:]
                z = ''
                l = ltd.pop()
                has_opened_tr.pop()
                has_opened_tr.append(True)
                if tr.pop():
                    z = u'</tr>' + z
                if td.pop():
                    z = u'</' + l + u'>' + z
                ltr.pop()
                t[k] = z
                tr.append(False)
                td.append(False)
                ltd.append(u'')
                # Row attributes are emitted with the next cell's <tr>.
                attributes = self.unstripForHTML(x, state)
                ltr.append(self.fixTagAttributes(attributes, u'tr'))
            elif u'|' == fc or u'!' == fc or u'|+' == x[0:2]: # Caption
                # x is a table row
                if u'|+' == x[0:2]:
                    fc = u'+'
                    x = x[1:]
                x = x[1:]
                if fc == u'!':
                    x = x.replace(u'!!', u'||')
                # Split up multiple cells on the same line.
                # FIXME: This can result in improper nesting of tags processed
                # by earlier parser steps, but should avoid splitting up eg
                # attribute values containing literal "||".
                x = x.split(u'||')

                t[k] = u''

                # Loop through each table cell
                for theline in x:
                    z = ''
                    if fc != u'+':
                        tra = ltr.pop()
                        if not tr.pop():
                            z = u'<tr' + tra + u'>\n'
                        tr.append(True)
                        ltr.append(u'')
                        has_opened_tr.pop()
                        has_opened_tr.append(True)
                    l = ltd.pop()
                    if td.pop():
                        z = u'</' + l + u'>' + z
                    if fc == u'|':
                        l = u'td'
                    elif fc == u'!':
                        l = u'th'
                    elif fc == u'+':
                        l = u'caption'
                    else:
                        l = u''
                    ltd.append(l)

                    # Cell parameters
                    y = theline.split(u'|', 1)
                    # Note that a '|' inside an invalid link should not
                    # be mistaken as delimiting cell parameters
                    if y[0].find(u'[[') != -1:
                        y = [theline]

                    if len(y) == 1:
                        y = z + u"<" + l + u">" + y[0]
                    else:
                        attributes = self.unstripForHTML(y[0], state)
                        y = z + u"<" + l + self.fixTagAttributes(attributes, l) + u">" + y[1]

                    t[k] += y
                    td.append(True)

        # Close any tables left open at the end of the input.
        while len(td) > 0:
            l = ltd.pop()
            if td.pop():
                t.append(u'</td>')
            if tr.pop():
                t.append(u'</tr>')
            if not has_opened_tr.pop():
                t.append(u'<tr><td></td></tr>')
            t.append(u'</table>')

        text = u'\n'.join(t)
        # special case: don't return empty table
        if text == u"<table>\n<tr><td></td></tr>\n</table>":
            text = u''

        return text

    def unstripForHTML(self, text, state):
        """Restore both 'general' and 'nowiki' strip markers in *text*."""
        return self.unstripNoWiki(self.unstrip(text, state), state)

    def unstrip(self, text, state):
        """Replace 'general' strip markers in *text* with their saved content."""
        for marker, replacement in state.get('general', {}).items():
            text = text.replace(marker, replacement)
        return text

    def unstripNoWiki(self, text, state):
        """Replace 'nowiki' strip markers in *text* with their saved content."""
        for marker, replacement in state.get('nowiki', {}).items():
            text = text.replace(marker, replacement)
        return text

    # <h1>..</h1> through <h6>..</h6> -- groups: level digit, attribute
    # string, headline body.
    _headerPat = re.compile(ur"<[Hh]([1-6])(.*?)>(.*?)</[Hh][1-6] *>", re.UNICODE)
    # Marker carrying a base64-encoded template title and section index.
    _templateSectionPat = re.compile(ur"<!--MWTEMPLATESECTION=([^&]+)&([^_]+)-->", re.UNICODE)
    # Any HTML tag (non-greedy); used to strip markup from TOC lines.
    _tagPat = re.compile(ur"<.*?>", re.UNICODE)
    def formatHeadings(self, text, isMain, showToc, state):
        """
        This function accomplishes several tasks:
        1) Auto-number headings if that option is enabled
        2) Add an [edit] link to sections for logged in users who have enabled the option
        3) Add a Table of contents on the top for users who have enabled the option
        4) Auto-anchor headings
        
        It loops through all headlines, collects the necessary data, then splits up the
        string and re-inserts the newly formatted headlines.
        """
        doNumberHeadings = False
        showEditLink = True # Can User Edit

        if text.find(u"__NOEDITSECTION__") != -1:
            showEditLink = False
            text = text.replace(u"__NOEDITSECTION__", u"")

        # Get all headlines for numbering them and adding funky stuff like [edit]
        # links - this is for later, but we need the number of headlines right now
        matches = self._headerPat.findall(text)
        numMatches = len(matches)

        # if there are fewer than 4 headlines in the article, do not show TOC
        # unless it's been explicitly enabled.
        enoughToc = showToc and (numMatches >= 4 or text.find(u"<!--MWTOC-->") != -1)
        
        # Allow user to stipulate that a page should have a "new section"
        # link added via __NEWSECTIONLINK__
        showNewSection = False
        if text.find(u"__NEWSECTIONLINK__") != -1:
            showNewSection = True
            text = text.replace(u"__NEWSECTIONLINK__", u"")
        # if the string __FORCETOC__ (not case-sensitive) occurs in the HTML,
        # override above conditions and always show TOC above first header
        if text.find(u"__FORCETOC__") != -1:
            showToc = True
            enoughToc = True
            text = text.replace(u"__FORCETOC__", u"")
        # Never ever show TOC if no headers
        if numMatches < 1:
            enoughToc = False

        # headline counter
        headlineCount = 0
        sectionCount = 0 # headlineCount excluding template sections

        # Ugh .. the TOC should have neat indentation levels which can be
        # passed to the skin functions. These are determined here
        toc = []
        head = {}
        sublevelCount = {}
        levelCount = {}
        toclevel = 0
        level = 0
        prevlevel = 0
        toclevel = 0
        prevtoclevel = 0
        refers = {}
        refcount = {}
        wgMaxTocLevel = 5
        
        for match in matches:
            headline = match[2]
            istemplate = False
            templatetitle = u''
            templatesection = 0
            numbering = []
            
            m = self._templateSectionPat.search(headline)
            if m:
                istemplate = True
                templatetitle = b64decode(m[0])
                templatesection = 1 + int(b64decode(m[1]))
                headline = self._templateSectionPat.sub(u'', headline)
            
            if toclevel:
                prevlevel = level
                prevtoclevel = toclevel
            
            level = matches[headlineCount][0]
            
            if doNumberHeadings or enoughToc:
                if level > prevlevel:
                    toclevel += 1
                    sublevelCount[toclevel] = 0
                    if toclevel < wgMaxTocLevel:
                        toc.append(u'\n<ul>')
                elif level < prevlevel and toclevel > 1:
                    # Decrease TOC level, find level to jump to
                    
                    if toclevel == 2 and level < levelCount[1]:
                        toclevel = 1
                    else:
                        for i in range(toclevel, 0, -1):
                            if levelCount[i] == level:
                                # Found last matching level
                                toclevel = i
                                break
                            elif levelCount[i] < level:
                                toclevel = i + 1
                                break
                    if toclevel < wgMaxTocLevel:
                        toc.append(u"</li>\n")
                        toc.append(u"</ul>\n</li>\n" * max(prevtoclevel - toclevel, 0))
                else:
                    if toclevel < wgMaxTocLevel:
                        toc.append(u"</li>\n")
                
                levelCount[toclevel] = level
                
                # count number of headlines for each level
                sublevelCount[toclevel] += 1
                for i in range(1, toclevel+1):
                    if sublevelCount[i]:
                        numbering.append(self.to_unicode(sublevelCount[i]))
            
            # The canonized header is a version of the header text safe to use for links
            # Avoid insertion of weird stuff like <math> by expanding the relevant sections
            canonized_headline = self.unstrip(headline, state)
            canonized_headline = self.unstripNoWiki(canonized_headline, state)
            
            # -- don't know what to do with this yet.
            # Remove link placeholders by the link text.
            #    <!--LINK number-->
            # turns into
            #    link text with suffix
    #       $canonized_headline = preg_replace( '/<!--LINK ([0-9]*)-->/e',
    #               "\$this->mLinkHolders['texts'][\$1]",
    #               $canonized_headline );
    #       $canonized_headline = preg_replace( '/<!--IWLINK ([0-9]*)-->/e',
    #               "\$this->mInterwikiLinkHolders['texts'][\$1]",
    #               $canonized_headline );

            # self.strip out HTML
            canonized_headline = self._tagPat.sub(u'', canonized_headline)
            tocline = canonized_headline.strip()
            # Save headline for section edit hint before it's escaped
            headline_hint = tocline
            canonized_headline = self.escapeId(tocline)
            refers[headlineCount] = canonized_headline

            # count how many in assoc. array so we can track dupes in anchors
            if canonized_headline not in refers:
                refers[canonized_headline] = 1
            else:
                refers[canonized_headline] += 1
            refcount[headlineCount] = refers[canonized_headline]
            
            numbering = '.'.join(numbering)
            
            # Don't number the heading if it is the only one (looks silly)
            if doNumberHeadings and numMatches > 1:
                # the two are different if the line contains a link
                headline = numbering + u' ' + headline

            # Create the anchor for linking from the TOC to the section
            anchor = canonized_headline;
            if refcount[headlineCount] > 1:
                anchor += u'_' + unicode(refcount[headlineCount])
            
            if enoughToc:
                toc.append(u'\n<li class="toclevel-')
                toc.append(self.to_unicode(toclevel))
                toc.append(u'"><a href="#w_')
                toc.append(anchor)
                toc.append(u'"><span class="tocnumber">')
                toc.append(numbering)
                toc.append(u'</span> <span class="toctext">')
                toc.append(tocline)
                toc.append(u'</span></a>')
            
    #       if showEditLink and (not istemplate or templatetitle != u""):
    #       if not head[headlineCount]:
    #           head[headlineCount] = u''
    #       
    #       if istemplate:
    #           head[headlineCount] += sk.editSectionLinkForOther(templatetile, templatesection)
    #       else:
    #           head[headlineCount] += sk.editSectionLink(mTitle, sectionCount+1, headline_hint)
            
            # give headline the correct <h#> tag
            if headlineCount not in head:
                head[headlineCount] = []
            h = head[headlineCount]
            h.append(u'<h')
            h.append(self.to_unicode(level))
            h.append(u' id="w_')
            h.append(anchor)
            h.append('">')
            h.append(matches[headlineCount][1].strip())
            h.append(headline.strip())
            h.append(u'</h')
            h.append(self.to_unicode(level))
            h.append(u'>')
            
            headlineCount += 1

            if not istemplate:
                sectionCount += 1
            
        if enoughToc:
            if toclevel < wgMaxTocLevel:
                toc.append(u"</li>\n")
                toc.append(u"</ul>\n</li>\n" * max(0, toclevel - 1))
            toc.insert(0, u'<div id="toc"><h2>Table of Contents</h2>')
            toc.append(u'</ul>\n</div>')

        # split up and insert constructed headlines
        
        blocks = self._headerPat.split(text)
        
        i = 0
        len_blocks = len(blocks)
        forceTocPosition = text.find(u"<!--MWTOC-->")
        full = []
        while i < len_blocks:
            j = i/4
            full.append(blocks[i])
            if enoughToc and not i and isMain and forceTocPosition == -1:
                full += toc
                toc = None
            if j in head and head[j]:
                full += head[j]
                head[j] = None
            i += 4
        full = u''.join(full)
        if forceTocPosition != -1:
            return full.replace(u"<!--MWTOC-->", u''.join(toc), 1)
        else:
            return full

    # Cache of compiled start-tag regexes, keyed by the '|'-joined tag list.
    _startRegexHash = {}
    # Cache of compiled end-tag regexes, keyed by element name.
    _endRegexHash = {}
    # End-of-comment delimiter (captured so split() keeps it).
    _endCommentPat = re.compile(ur'(-->)', re.UNICODE)
    # Monotonic counter used to build unique strip markers.
    # NOTE: class-level, so markers stay unique across instances.
    _extractTagsAndParams_n = 1
    def extractTagsAndParams(self, elements, text, matches, uniq_prefix = u''):
        """
        Replace all occurrences of HTML-style comments and the given tags
        in the text with unique markers and return the stripped text.

        The output parameter `matches` maps each marker to a tuple:
          (element name,
           tag content (None for empty-element tags),
           {param: value} attribute dict,
           original full text of the tag)
        """
        stripped = u''

        taglist = u'|'.join(elements)
        # Start-tag patterns are cached per tag list.  (Literals are written
        # py2/py3-compatibly; the pattern text is unchanged.)
        if taglist not in self._startRegexHash:
            self._startRegexHash[taglist] = re.compile(u"<(" + taglist + u")(\\s+[^>]*?|\\s*?)(/?>)|<(!--)", re.UNICODE | re.IGNORECASE)
        start = self._startRegexHash[taglist]

        while text != u'':
            p = start.split(text, 1)
            stripped += p[0]
            if len(p) == 1:
                # No further tags or comments.
                break
            elif p[4]:
                # HTML comment ('<!--'); it has no attributes or close token.
                element = p[4]
                attributes = u''
                close = u''
            else:
                element = p[1]
                attributes = p[2]
                close = p[3]
            inside = p[5]

            marker = uniq_prefix + u'-' + element + u'-' + (u"%08X" % self._extractTagsAndParams_n) + u'-QINU'
            self._extractTagsAndParams_n += 1
            stripped += marker

            if close == u'/>':
                # empty element tag, <tag />
                content = None
                text = inside
                tail = None
            else:
                if element == u'!--':
                    end = self._endCommentPat
                else:
                    if element not in self._endRegexHash:
                        self._endRegexHash[element] = re.compile(u'(</' + element + u'\\s*>)', re.UNICODE | re.IGNORECASE)
                    end = self._endRegexHash[element]
                q = end.split(inside, 1)
                content = q[0]
                if len(q) < 3:
                    # no end tag
                    tail = ''
                    text = ''
                else:
                    tail = q[1]
                    text = q[2]

            # BUG FIX: content/tail are None for empty-element tags; the old
            # code crashed concatenating them into the reconstructed source.
            matches[marker] = (
                element,
                content,
                self.decodeTagAttributes(attributes),
                u"<" + element + attributes + close + (content or u'') + (tail or u'')
            )
        return stripped

    def strip(self, text, state, uniq_prefix, stripcomments = False, dontstrip = []):
        """
        Strip extension tags, <nowiki>, <html> and comments out of *text*,
        replacing each with a unique marker recorded in *state* so that
        unstrip()/unstripNoWiki() can restore the rendered output later.
        Comments are put back immediately unless *stripcomments* is set.
        """
        render = True

        commentState = {}

        # BUG FIX: list + dict.keys() fails on Python 3; wrap in list().
        elements = ['nowiki',] + list(self.mTagHooks.keys())
        if True: #wgRawHtml
            elements.append('html')

        # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
        # BUG FIX: `del elements[k]` with a string key raised TypeError on a
        # list; remove() is the correct operation.
        for k in dontstrip:
            if k in elements:
                elements.remove(k)

        matches = {}
        text = self.extractTagsAndParams(elements, text, matches, uniq_prefix)

        for marker in matches:
            element, content, params, tag = matches[marker]
            if render:
                tagName = element.lower()
                if tagName == u'!--':
                    # comment: keep it verbatim, ensuring it is terminated
                    output = tag
                    if tag[-3:] != u'-->':
                        output += "-->"
                elif tagName == u'html':
                    # raw HTML passes straight through (wgRawHtml)
                    output = content
                elif tagName == u'nowiki':
                    output = content.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
                else:
                    if tagName in self.mTagHooks:
                        # registered extension tag hook renders the content
                        output = self.mTagHooks[tagName](content, params)
                    else:
                        output = content.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
            else:
                # Just stripping tags; keep the source
                output = tag

            # Unstrip the output, because self.unstrip() is no longer recursive so
            # it won't do it itself
            output = self.unstrip(output, state)

            if not stripcomments and element == u'!--':
                commentState[marker] = output
            elif element == u'html' or element == u'nowiki':
                if 'nowiki' not in state:
                    state['nowiki'] = {}
                state['nowiki'][marker] = output
            else:
                if 'general' not in state:
                    state['general'] = {}
                state['general'][marker] = output

        # Unstrip comments unless explicitly told otherwise.
        # (The comments are always stripped prior to this point, so as to
        # not invoke any extension tags / parser hooks contained within
        # a comment.)
        if not stripcomments:
            # Put them all back and forget them
            for k in commentState:
                v = commentState[k]
                text = text.replace(k, v)

        return text

    # Stack of template-argument dicts for recursive replaceVariables calls.
    # NOTE: defined at class level, so it is shared across instances.
    mArgStack = []

    def replaceVariables(self, text, args = {}, argsOnly = False):
        """
        Replace magic variables, templates, and template arguments
        with the appropriate text. Templates are substituted recursively,
        taking care to avoid infinite loops.

        NOTE: currently a stub -- the early return below short-circuits the
        whole mechanism, so *text* is returned unmodified and everything
        after it is dead code (it references an undefined `argSubstitution`).
        """
        # Intentionally disabled: return the input untouched.
        return text
        # Prevent too big inclusions
    #   if( strlen( $text ) > $this->mOptions->getMaxIncludeSize() ) {
    #       return $text;
    #   }

        # This function is called recursively. To keep track of arguments we need a stack:
        self.mArgStack.append(args)
        
        braceCallbacks = {}
        if not argsOnly:
            braceCallbacks[2] = [None, self.braceSubstitution]
        braceCallbacks[3] = [None, argSubstitution]
        
        callbacks = {
            u'{': {
                'end': u'}',
                'cb': braceCallbacks,
                'min': argsOnly and 3 or 2,
                'max': 3
            },
            u'[': {
                'end': u']',
                'cb': {2: None},
                'min': 2,
                'max': 2
            }
        }
        text = self.replace_callback(text, callbacks)
        self.mArgStack.pop()
        
        return text

    def replace_callback(self, text, callbacks):
        """
        self.parse any parentheses in format ((title|part|part))
        and call callbacks to get a replacement text for any found piece
        """
        openingBraceStack = []    # this array will hold a stack of parentheses which are not closed yet
        lastOpeningBrace = -1      # last not closed parentheses

        validOpeningBraces = u''.join(callbacks.keys())
        
        i = 0
        while i < len(text):
            if lastOpeningBrace == -1:
                currentClosing = u''
                search = validOpeningBraces
            else:
                currentClosing = openingBraceStack[lastOpeningBrace]['braceEnd']
                search = validOpeningBraces + u'|' + currentClosing
            rule = None
            pos = -1
            for c in search:
                pos = max(pos, text.find(c, i))
            pos -= i
            pos += 1
            if pos == 0:
                pos = len(text)-i
            i += pos
            if i < len(text):
                if text[i] == u'|':
                    found = 'pipe'
                elif text[i] == currentClosing:
                    found = 'close'
                elif text[i] in callbacks:
                    found = 'open'
                    rule = callbacks[text[i]]
                else:
                    i += 1
                    continue
            else:
                break
            
            if found == 'open':
                # found opening brace, let's add it to parentheses stack
                piece = {
                    'brace': text[i],
                    'braceEnd': rule['end'],
                    'title': u'',
                    'parts': None
                }

                # count opening brace characters
                count = 0
                while True:
                    if text[i+count:i+1+count] == piece['brace']:
                        count += 1
                    else:
                        break
                piece['count'] = count
                i += piece['count']
                piece['startAt'] = piece['partStart'] = i

                # we need to add to stack only if opening brace count is enough for one of the rules
                if piece['count'] >= rule['min']:
                    lastOpeningBrace += 1
                    openingBraceStack[lastOpeningBrace] = piece
            elif found == 'close':
                maxCount = openingBraceStack[lastOpeningBrace]['count']
                count = 0
                while count < maxCount:
                    if text[i+count:i+1+count] == text[i]:
                        count += 1
                    else:
                        break
                
                # check for maximum matching characters (if there are 5 closing 
                # characters, we will probably need only 3 - depending on the rules)
                matchingCount = 0
                matchingCallback = None
                cbType = callbacks[openingBraceStack[lastOpeningBrace]['brace']]
                if count > cbType['max']:
                    # The specified maximum exists in the callback array, unless the caller 
                    # has made an error
                    matchingCount = cbType['max']
                else:
                    # Count is less than the maximum
                    # Skip any gaps in the callback array to find the true largest match
                    # Need to use array_key_exists not isset because the callback can be null
                    matchingCount = count
                    while matchingCount > 0 and matchingCount not in cbType['cb']:
                        matchingCount -= 1
                
                if matchingCount <= 0:
                    i += count
                    continue
                matchingCallback = cbType['cb'][matchingCount]
                
                # let's set a title or last part (if '|' was found)
                if openingBraceStack[lastOpeningBrace]['parts'] is None:
                    openingBraceStack[lastOpeningBrace]['title'] = \
                        text[openingBraceStack[lastOpeningBrace]['partStart']:i]
                else:
                    openingBraceStack[lastOpeningBrace]['parts'].append( 
                        text[openingBraceStack[lastOpeningBrace]['partStart']:i]
                    )

                pieceStart = openingBraceStack[lastOpeningBrace]['startAt'] - matchingCount
                pieceEnd = i + matchingCount
                
                if callable(matchingCallback):
                    cbArgs = {
                        'text': text[pieceStart:pieceEnd],
                        'title': openingBraceStack[lastOpeningBrace]['title'].strip(),
                        'parts': openingBraceStack[lastOpeningBrace]['parts'],
                        'lineStart': pieceStart > 0 and text[pieceStart-1] == u"\n"
                    }
                    # finally we can call a user callback and replace piece of text
                    replaceWith = matchingCallback(cbArgs)
                    text = text[:pieceStart] + replaceWith + text[pieceEnd:]
                    i = pieceStart + len(replaceWith)
                else:
                    # null value for callback means that parentheses should be parsed, but not replaced
                    i += matchingCount
                
                # reset last opening parentheses, but keep it in case there are unused characters
                piece = {
                    'brace': openingBraceStack[lastOpeningBrace]['brace'],   
                    'braceEnd': openingBraceStack[lastOpeningBrace]['braceEnd'],
                    'count': openingBraceStack[lastOpeningBrace]['count'],
                    'title': u'',
                    'parts': None,
                    'startAt': openingBraceStack[lastOpeningBrace]['startAt']
                }
                openingBraceStack[lastOpeningBrace] = None
                lastOpeningBrace -= 1
                
                if matchingCount < piece['count']:
                    piece['count'] -= matchingCount
                    piece['startAt'] -= matchingCount
                    piece['partStart'] = piece['startAt']
                    # do we still qualify for any callback with remaining count?
                    currentCbList = callbacks[piece['brace']]['cb']
                    while piece['count']:
                        if piece['count'] in currentCbList:
                            lastOpeningBrace += 1
                            openingBraceStack[lastOpeningBrace] = piece
                            break
                        
                        piece['count'] -= 1
            
            elif found == 'pipe':
                # lets set a title if it is a first separator, or next part otherwise
                if opeingBraceStack[lastOpeningBrace]['parts'] is None:
                    openingBraceStack[lastOpeningBrace]['title'] = \
                        text[openingBraceStack[lastOpeningBrace]['partStart']:i]
                    openingBraceStack[lastOpeningBrace]['parts'] = []
                else:
                    openingBraceStack[lastOpeningBrace]['parts'].append(
                        text[openingBraceStack[lastOpeningBrace]['partStart']:i]
                    )
                i += 1
                openingBraceStack[lastOpeningBrace]['partStart'] = i

        return text

    def braceSubstitution(self, piece):
        """
        Return the text of a template, after recursively
        replacing any variables or templates within the template.

        `piece` is a dict built by the brace parser with at least
        'title' (text before the first |), 'parts' (None, or the list of
        |-separated argument strings) and 'text' (the matched source).

        NOTE(review): this is an unfinished port of MediaWiki's PHP
        Parser::braceSubstitution().  It still references names that are
        not defined anywhere in this module (mTemplates, mTemplatePath,
        NS_TEMPLATE, mTitle, Title, wgContLang, mOptions, SpecialPage,
        fetchTemplate, maybeDoSubpageLink) and it never returns the
        computed `text` -- confirm against the PHP original before
        relying on it.
        """
    #   global $wgContLang, $wgLang, $wgAllowDisplayTitle, $action;

        # Flags
        found = False             # text has been filled
        nowiki = False            # wiki markup in text should be escaped
        noparse = False           # Unsafe HTML tags should not be stripped, etc.
        noargs = False            # Don't replace triple-brace arguments in text
        replaceHeadings = False   # Make the edit section links go to the template not the article
        isHTML = False            # text is HTML, armour it against wikitext transformation
        forceRawInterwiki = False # Force interwiki transclusion to be done in raw mode not rendered

        # Title object, where text came from
        title = None

        linestart = u''

        # part1 is the bit before the first |, and must contain only title characters
        # args is a list of arguments, starting from index 0, not including part1
        titleText = part1 = piece['title']

        if piece['parts'] is None:
            # BUG FIX: was a bare `variableSubstitution(...)` call, a
            # guaranteed NameError; the helper is assumed to be a method
            # on this parser object -- confirm against the PHP original.
            replaceWith = self.variableSubstitution([piece['text'], piece['title']])
            if replaceWith != piece['text']:
                text = replaceWith
                found = True
                noparse = True
                noargs = True

        # BUG FIX: the old `parts is None and [] or parts` idiom returned
        # None when parts was None (the empty list is falsy, so the `or`
        # fell through), which crashed the len() below.
        args = [] if piece['parts'] is None else piece['parts']
        argc = len(args)

        # SUBST
        if not found:
            mwSubst = u"SUBST"
            if part1.find(mwSubst) != -1:
                # One of two possibilities is true:
                # 1) Found SUBST but not in the PST phase
                # 2) Didn't find SUBST and in the PST phase
                # In either case, return without further processing
                # BUG FIX: str.replace returns a new string; the result
                # was previously discarded.
                part1 = part1.replace(mwSubst, u'', 1)
                text = piece['text']
                found = True
                noparse = True
                noargs = True

        # MSG, MSGNW and RAW
        if not found:
            # Check for MSGNW:
            mwMsgnw = u"MSGNW"
            if part1.find(mwMsgnw) != -1:
                # BUG FIX (here and below): assign the replace() result.
                part1 = part1.replace(mwMsgnw, u'', 1)
                nowiki = True
            else:
                mwMsg = u"MSG"
                part1 = part1.replace(mwMsg, u'', 1)

            # Check for RAW:
            mwRaw = u"RAW"
            if part1.find(mwRaw) != -1:
                part1 = part1.replace(mwRaw, u'', 1)
                forceRawInterwiki = True

        # Parser functions
        if not found:
            colonPos = part1.find(u':')
            if colonPos != -1:
                # Case sensitive functions
                function = part1[0:colonPos]
                if function in self.mFunctionSynonyms[1]:
                    function = self.mFunctionSynonyms[1][function]
                else:
                    # Case insensitive functions
                    function = function.lower()
                    if function in self.mFunctionSynonyms[0]:
                        function = self.mFunctionSynonyms[0][function]
                    else:
                        function = False

                if function:
                    funcArgs = [x.strip() for x in args]
                    funcArgs += [None, part1[colonPos+1:].strip()]
                    result = self.mFunctionHooks[function](*funcArgs)
                    found = True

                    # The text is usually already parsed, doesn't need triple-brace tags expanded, etc.
                    #$noargs = true;
                    #$noparse = true;

                    if isinstance(result, dict):
                        if 0 in result:
                            # BUG FIX: previously wrote to the undefined
                            # name `tex` and indexed the builtin `list`
                            # instead of `result`.
                            text = linestart + result[0]
                            del result[0]

                        # Extract flags into the local scope
                        # This allows callers to set flags such as nowiki, noparse, found, etc.
                        if 'nowiki' in result:
                            nowiki = result['nowiki']
                        if 'noparse' in result:
                            noparse = result['noparse']
                        if 'found' in result:
                            found = result['found']
                    else:
                        text = linestart + result

        # Template table test

        # Did we encounter this template already? If yes, it is in the cache
        # and we need to check for loops.
        if not found and piece['title'] in mTemplates:
            found = True

            # Infinite loop test
            if part1 in mTemplatePath:
                noparse = True
                noargs = True
                found = True
                text = linestart + u"[[" + part1 + u"]]<!-- WARNING: template loop detected -->"
            else:
                text = linestart + mTemplates[piece['title']]

        # Load from database
        lastPathLevel = mTemplatePath
        if not found:
            ns = NS_TEMPLATE
            # declaring $subpage directly in the function call
            # does not work correctly with references and breaks
            # {{/subpage}}-style inclusions
            subpage = u''
            part1 = maybeDoSubpageLink(part1, subpage)
            if subpage != u'':
                ns = mTitle.getNamespace()
            title = Title.newFromText(part1, ns)

            if title is not None:
                titleText = title.getPrefixedText()
                checkVariantLink = len(wgContLang.getVariants()) > 1
                # Check for language variants if the template is not found
                if checkVariantLink and title.getArticleID() == 0:
                    wgContLang.findVariantLink(part1, title)
                if title.isExternal():
                    if title.getNamespace() == u"Special" and mOptions.getAllowSpecialInclusion():
                        text = SpecialPage.capturePath(title)
                        if isinstance(text, basestring):
                            found = True
                            noparse = True
                            noargs = True
                            isHTML = True
                            # BUG FIX: `this` -> `self` (PHP leftover)
                            self.disableCache()
                    else:
                        articleContent = fetchTemplate(title)
                        if articleContent != False:
                            found = True
                            text = articleContent
                            # BUG FIX: `true` -> `True` (PHP leftover)
                            replaceHeadings = True

                    # If the title is valid but undisplayable, make a link to it
                    if not found:
                        text = u"[[:" + titleText + u"]]"
                        found = True
                elif title.isTrans():
                    pass
    #           # Interwiki transclusion
    #           if ( $this->ot['html'] && !$forceRawInterwiki ) {
    #           $text = $this->interwikiTransclude( $title, 'render' );
    #           $isHTML = true;
    #           $noparse = true;
    #           } else {
    #           $text = $this->interwikiTransclude( $title, 'raw' );
    #           $replaceHeadings = true;
    #           }
    #           $found = true;
    #       }

                # Template cache array insertion
                # Use the original $piece['title'] not the mangled $part1, so that
                # modifiers such as RAW: produce separate cache entries
                if found:
                    if isHTML:
                        pass # A special page; don't store it in the template cache.
                    else:
                        # BUG FIX: `place` was a typo for `piece`
                        mTemplates[piece['title']] = text
                    text = linestart + text

    _guillemetLeftPat = re.compile(ur'(.) (\?|:|;|!|\302\273)', re.UNICODE)
    _guillemetRightPat = re.compile(ur'(\302\253) ', re.UNICODE)
    def fixtags(self, text):
        """Clean up special characters, only run once, next-to-last before self.doBlockLevels"""
        # french spaces, last one Guillemet-left
        # only if there is something before the space
        text = self._guillemetLeftPat.sub(ur'\1&nbsp;\2', text)
        # french spaces, Guillemet-right
        text = self._guillemetRightPat.sub(ur'\1&nbsp;', text)
        return text

    def closeParagraph(self, mLastSection):
        """Close the currently open block section, if any.

        Used by self.doBlockLevels().  Returns u'</X>\\n' when section
        tag X is open, or the empty string when nothing is open.
        """
        if mLastSection == u'':
            return u''
        return u'</%s>\n' % mLastSection

    def getCommon(self, st1, st2):
        """
        Return the length of the longest common prefix of the two
        arguments (comparison starts at the beginning of both strings).
        """
        limit = min(len(st1), len(st2))
        for idx in range(limit):
            if st1[idx] != st2[idx]:
                return idx
        return limit

    def openList(self, char, mLastSection):
        """
        These next three functions open, continue, and close the list
        element appropriate to the prefix character passed into them.
        """
        result = self.closeParagraph(mLastSection)
        
        mDTopen = False
        if char == u'*':
            result += u'<ul><li>'
        elif char == u'#':
            result += u'<ol><li>'
        elif char == u':':
            result += u'<dl><dd>'
        elif char == u';':
            result += u'<dl><dt>'
            mDTopen = True
        else:
            result += u'<!-- ERR 1 -->'
        
        return result, mDTopen

    def nextItem(self, char, mDTopen):
        """Continue a list at the same level for prefix character `char`.

        Returns (html, new_mDTopen): the close+open markup for the next
        list item, and the new <dt>-open state (None when unchanged).

        BUG FIX: the fall-through error branch used to return a bare
        string, which blew up every caller (doBlockLevels) that unpacks
        the result into two names; it now returns a tuple like the other
        branches.
        """
        if char == u'*' or char == u'#':
            return u'</li><li>', None
        if char == u':' or char == u';':
            # Close whichever of <dt>/<dd> is currently open...
            close = u'</dt>' if mDTopen else u'</dd>'
            # ...then open the one requested.
            if char == u';':
                return close + u'<dt>', True
            return close + u'<dd>', False
        return u'<!-- ERR 2 -->', None

    def closeList(self, char, mDTopen):
        """Close the list element opened for prefix character `char`.

        `mDTopen` selects between </dt> and </dd> for definition lists.
        """
        if char == u':':
            return u'</dt></dl>\n' if mDTopen else u'</dd></dl>\n'
        closers = {
            u'*': u'</li></ul>\n',
            u'#': u'</li></ol>\n',
        }
        return closers.get(char, u'<!-- ERR 3 -->')

    # Patterns used by doBlockLevels to detect <pre>/</pre> on a line and
    # to recognise tags that switch it out of paragraph mode.
    _closePrePat = re.compile(u"</pre", re.UNICODE | re.IGNORECASE)
    _openPrePat = re.compile(u"<pre", re.UNICODE | re.IGNORECASE)
    # Opening (and a few closing) block-level tags; a match suppresses
    # paragraph wrapping for that line in doBlockLevels.
    _openMatchPat = re.compile(u"(<table|<blockquote|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|<p|<ul|<ol|<li|</center|</tr|</td|</th)", re.UNICODE | re.IGNORECASE)

    def findColonNoLinks(self, text, before, after):
        """Find the first ':' in `text` that is not inside an HTML tag.

        Intended to return (before, after, position) on success and
        False on failure, mirroring MediaWiki's findColonNoLinks().

        NOTE(review): this port looks unfinished and, as written, always
        returns False: `text.search(':')` is not a str/unicode method,
        so the bare `except` below fires on the first statement for
        every input.  Further signs: the `before`/`after` parameters are
        written but never read by callers; the success paths return
        `text[i+1]` / `text[colon+1]` (a single character) where a slice
        to end-of-string was probably intended; and the state-machine
        loop never increments `i` except via `i = lt`, so it would spin
        forever if it were ever reached.  Callers in doBlockLevels
        currently rely only on the False result -- confirm against the
        PHP original before fixing, since the caller's unpacking must
        change in step.
        """
        try:
            # NOTE(review): str has no .search(); this always raises and
            # the function returns False immediately.
            pos = text.search(':')
        except:
            return False

        lt = text.find('<')
        if lt == -1 or lt > pos:
            # Easy; no tag nesting to worry about
            # NOTE(review): `after` is probably meant to be text[pos+1:]
            before = text[0:pos]
            after = text[0:pos+1]
            return before, after, pos

        # Ugly state machine to walk through avoiding tags.
        state = self.MW_COLON_STATE_TEXT;
        stack = 0;
        i = 0
        # NOTE(review): no `i += 1` anywhere in this loop -- unreachable
        # today (see docstring) but an infinite loop if ever reached.
        while i < len(text):
            c = text[i];

            if state == 0: # self.MW_COLON_STATE_TEXT:
                if text[i] == '<':
                    # Could be either a <start> tag or an </end> tag
                    state = self.MW_COLON_STATE_TAGSTART
                elif text[i] == ':':
                    if stack == 0:
                        # we found it
                        return text[0:i], text[i+1], i
                else:
                    # Skip ahead looking for something interesting
                    try:
                        # NOTE(review): same nonexistent .search() as above
                        colon = text.search(':', i)
                    except:
                        return False
                    lt = text.find('<', i)
                    if stack == 0:
                        if lt == -1 or colon < lt:
                            # we found it
                            return text[0:colon], text[colon+1], i
                    if lt == -1:
                        break
                    # Skip ahead to next tag start
                    i = lt
                    state = self.MW_COLON_STATE_TAGSTART
            elif state == 1: # self.MW_COLON_STATE_TAG:
                # In a <tag>
                if text[i] == '>':
                    # Tag closed: one more level of nesting to track.
                    stack += 1
                    state = self.MW_COLON_STATE_TEXT
                elif text[i] == '/':
                    # Possible self-closing tag, e.g. <br/>
                    state = self.MW_COLON_STATE_TAGSLASH
            elif state == 2: # self.MW_COLON_STATE_TAGSTART:
                if text[i] == '/':
                    state = self.MW_COLON_STATE_CLOSETAG
                elif text[i] == '!':
                    # <!-- comment start
                    state = self.MW_COLON_STATE_COMMENT
                elif text[i] == '>':
                    # Illegal early close? This shouldn't happen D:
                    state = self.MW_COLON_STATE_TEXT
                else:
                    state = self.MW_COLON_STATE_TAG
            elif state == 3: # self.MW_COLON_STATE_CLOSETAG:
                # In a </tag>
                if text[i] == '>':
                    stack -= 1
                    if stack < 0:
                        # More closing than opening tags: malformed input.
                        return False
                    state = self.MW_COLON_STATE_TEXT
            elif state == self.MW_COLON_STATE_TAGSLASH:
                if text[i] == '>':
                    # Yes, a self-closed tag <blah/>
                    state = self.MW_COLON_STATE_TEXT
                else:
                    # Probably we're jumping the gun, and this is an attribute
                    state = self.MW_COLON_STATE_TAG
            elif state == 5: # self.MW_COLON_STATE_COMMENT:
                if text[i] == '-':
                    state = self.MW_COLON_STATE_COMMENTDASH
            elif state == self.MW_COLON_STATE_COMMENTDASH:
                if text[i] == '-':
                    state = self.MW_COLON_STATE_COMMENTDASHDASH
                else:
                    state = self.MW_COLON_STATE_COMMENT
            elif state == self.MW_COLON_STATE_COMMENTDASHDASH:
                if text[i] == '>':
                    # --> closes the comment
                    state = self.MW_COLON_STATE_TEXT
                else:
                    state = self.MW_COLON_STATE_COMMENT
            else:
                # Unknown state: internal error.
                raise
        if stack > 0:
            # Unclosed tags left on the stack.
            return False
        return False

    def doBlockLevels(self, text, linestart, mUniqPrefix):
        # Parsing through the text line by line.  The main thing
        # happening here is handling of block-level elements p, pre,
        # and making lists from lines starting with * # : etc.
        lastPrefix = u''
        mDTopen = inBlockElem = False
        prefixLength = 0
        paragraphStack = False
        _closeMatchPat = re.compile(ur"(</table|</blockquote|</h1|</h2|</h3|</h4|</h5|</h6|<td|<th|<div|</div|<hr|</pre|</p|" +  mUniqPrefix + ur"-pre|</li|</ul|</ol|<center)", re.UNICODE | re.IGNORECASE)
        mInPre = False
        mLastSection = u''
        mDTopen = False
        output = []
        for oLine in text.split('\n')[not linestart and 1 or 0:]:
            lastPrefixLength = len(lastPrefix)
            preCloseMatch = self._closePrePat.search(oLine)
            preOpenMatch = self._openPrePat.search(oLine)
            if not mInPre:
                chars = u'*#:;'
                prefixLength = 0
                for c in oLine:
                    if c in chars:
                        prefixLength += 1
                    else:
                        break
                pref = oLine[0:prefixLength]
                
                # eh?
                pref2 = pref.replace(u';', u':')
                t = oLine[prefixLength:]
                mInPre = bool(preOpenMatch)
            else:
                # Don't interpret any other prefixes in preformatted text
                prefixLength = 0
                pref = pref2 = u''
                t = oLine

            # List generation
            if prefixLength and lastPrefix == pref2:
                # Same as the last item, so no need to deal with nesting or opening stuff
                tmpOutput, tmpMDTopen = self.nextItem(pref[-1:], mDTopen)
                output.append(tmpOutput)
                if tmpMDTopen is not None:
                    mDTopen = tmpMDTopen
                paragraphStack = False
                
                if pref[-1:] == u';':
                    # The one nasty exception: definition lists work like this:
                    # ; title : definition text
                    # So we check for : in the remainder text to split up the
                    # title and definition, without b0rking links.
                    term = t2 = u''
                    z = self.findColonNoLinks(t, term, t2)
                    if z != False:
                        term, t2 = z[1:2]
                        t = t2
                        output.append(term)
                        tmpOutput, tmpMDTopen = self.nextItem(u':', mDTopen)
                        output.append(tmpOutput)
                        if tmpMDTopen is not None:
                            mDTopen = tmpMDTopen
            
            elif prefixLength or lastPrefixLength:
                # Either open or close a level...
                commonPrefixLength = self.getCommon(pref, lastPrefix)
                paragraphStack = False
                while commonPrefixLength < lastPrefixLength:
                    tmp = self.closeList(lastPrefix[lastPrefixLength-1], mDTopen)
                    output.append(tmp)
                    mDTopen = False
                    lastPrefixLength -= 1
                if prefixLength <= commonPrefixLength and commonPrefixLength > 0:
                    tmpOutput, tmpMDTopen = self.nextItem(pref[commonPrefixLength-1], mDTopen)
                    output.append(tmpOutput)
                    if tmpMDTopen is not None:
                        mDTopen = tmpMDTopen

                while prefixLength > commonPrefixLength:
                    char = pref[commonPrefixLength:commonPrefixLength+1]
                    tmpOutput, tmpMDTOpen = self.openList(char, mLastSection)
                    if tmpMDTOpen:
                        mDTopen = True
                    output.append(tmpOutput)
                    mLastSection = u''
                    mInPre = False
                    
                    if char == u';':
                        # FIXME: This is dupe of code above
                        term = t2 = u''
                        z = self.findColonNoLinks(t, term, t2)
                        if z != False:
                            term, t2 = z[1:2]
                            t = t2
                            output.append(term)
                            tmpOutput, tmpMDTopen = self.nextItem(u':', mDTopen)
                            output.append(tmpOutput)
                            if tmpMDTopen is not None:
                                mDTopen = tmpMDTopen

                    commonPrefixLength += 1
                
                lastPrefix = pref2
            
            if prefixLength == 0:
                # No prefix (not in list)--go to paragraph mode
                # XXX: use a stack for nestable elements like span, table and div
                openmatch = self._openMatchPat.search(t)
                closematch = _closeMatchPat.search(t)
                if openmatch or closematch:
                    paragraphStack = False
                    output.append(self.closeParagraph(mLastSection))
                    mLastSection = u''
                    mInPre = False
                    if preOpenMatch and preCloseMatch:
                        mInPre = True
                    if closematch:
                        inBlockElem = False
                    else:
                        inBlockElem = True
                elif not inBlockElem and not mInPre:
                    if t[0:1] == u' ' and (mLastSection ==  u'pre' or t.strip() != u''):
                        # pre
                        if mLastSection != u'pre':
                            paragraphStack = False
                            output.append(self.closeParagraph(u'') + u'<pre>')
                            mInPre = False
                            mLastSection = u'pre'
                        t = t[1:]
                    else:
                        # paragraph
                        if t.strip() == u'':
                            if paragraphStack:
                                output.append(paragraphStack + u'<br />')
                                paragraphStack = False
                                mLastSection = u'p'
                            else:
                                if mLastSection != u'p':
                                    output.append(self.closeParagraph(mLastSection))
                                    mLastSection = u''
                                    mInPre = False
                                    paragraphStack = u'<p>'
                                else:
                                    paragraphStack = u'</p><p>'
                        else:
                            if paragraphStack:
                                output.append(paragraphStack)
                                paragraphStack = False
                                mLastSection = u'p'
                            elif mLastSection != u'p':
                                output.append(self.closeParagraph(mLastSection) + u'<p>')
                                mLastSection = u'p'
                                mInPre = False
            
            # somewhere above we forget to get out of pre block (bug 785)
            if preCloseMatch and mInPre:
                mInPre = False
            
            if paragraphStack == False:
                output.append(t + u"\n")
        
        while prefixLength:
            output.append(self.closeList(pref2[prefixLength-1], mDTopen))
            mDTopen = False
            prefixLength -= 1
        
        if mLastSection != u'':
            output.append(u'</' + mLastSection + u'>')
            mLastSection = u''
        
        return ''.join(output)

    def parse(self, text, showToc=True):
        """Run the full wikitext rendering pipeline over `text`.

        Stages, in order: strip protected regions, remove HTML tags,
        replace variables, tables, horizontal rules, TOC check, headers,
        quotes, internal links, external links, heading formatting,
        unstrip, character fix-ups, block levels, nowiki unstrip.

        If `text` is a byte string it is decoded to unicode first and
        the result is re-encoded as UTF-8; unicode input yields unicode
        output.  `showToc` is forced off when no TOC was found and no
        explicit <!--MWTOC--> marker is present.
        """
        utf8 = isinstance(text, str)
        text = self.to_unicode(text)
        # The line-based passes need a trailing newline; remember whether
        # we added one so it can be removed again at the end.
        if text[-1:] != u'\n':
            text = text + u'\n'
            taggedNewline = True
        else:
            taggedNewline = False
        mStripState = {}
        # Random marker prefix used to protect stripped regions from the
        # wikitext transformations.
        mUniqPrefix = u"\x07UNIQ" + unicode(random.randint(1, 1000000000))

        text = self.strip(text, mStripState, mUniqPrefix)
        text = self.removeHtmlTags(text)
        text = self.replaceVariables(text)
        text = self.doTableStuff(text, mStripState)
        text = self.parseHorizontalRule(text)
        text, toc = self.checkTOC(text)
        text = self.parseHeaders(text)
        text = self.parseAllQuotes(text)
        text = self.replaceInternalLinks(text)
        text = self.replaceExternalLinks(text)
        if not toc and text.find(u"<!--MWTOC-->") == -1:
            showToc = False
        text = self.formatHeadings(text, True, showToc, mStripState)
        text = self.unstrip(text, mStripState)
        text = self.fixtags(text)
        text = self.doBlockLevels(text, True, mUniqPrefix)
        text = self.unstripNoWiki(text, mStripState)
        # Undo the trailing newline we added above, if still present.
        if taggedNewline and text[-1:] == u'\n':
            text = text[:-1]
        if utf8:
            return text.encode("utf-8")
        return text

    def truncate_url(self, url, length=40):
        """Shorten `url` to roughly `length` characters for display.

        The URL is split at its final path segment ('/segment' or
        '/segment/').  If the head already fits in length-3 characters
        the tail is truncated and suffixed with '...'; otherwise the
        head is truncated and the tail prefixed with '...'.  URLs that
        are short enough, or that contain no path segment, are returned
        unchanged.

        NOTE(review): the result is not guaranteed to be <= length --
        the untouched side plus the ellipsis can exceed the budget.
        Behavior kept as-is for compatibility.
        """
        if len(url) <= length:
            return url
        # Final path segment, e.g. '/page' or '/page/'.
        # (Fix: dropped the redundant function-level `import re` -- the
        # module already imports re -- and the unused `domain` local.)
        match = re.search(r'(/[^/]+/?)$', url)
        if not match:
            return url
        tail = match.group(1)
        firstpart = url[0:len(url) - len(tail)]
        secondpart = tail
        if firstpart == firstpart[0:length - 3]:
            # Head fits; trim the tail instead.
            secondpart = secondpart[0:length - 3] + '...'
        else:
            firstpart = firstpart[0:length - 3]
            secondpart = '...' + secondpart
        return firstpart + secondpart
        
    def to_unicode(self, text, charset=None):
        """Convert a `str` object to an `unicode` object.

        If `charset` is given, we simply assume that encoding for the text,
        but we'll use the "replace" mode so that the decoding will always
        succeed.
        If `charset` is ''not'' specified, we'll make some guesses, first
        trying the UTF-8 encoding, then trying the locale preferred encoding,
        in "replace" mode. This differs from the `unicode` builtin, which
        by default uses the locale preferred encoding, in 'strict' mode,
        and is therefore prompt to raise `UnicodeDecodeError`s.

        Because of the "replace" mode, the original content might be altered.
        If this is not what is wanted, one could map the original byte content
        by using an encoding which maps each byte of the input to an unicode
        character, e.g. by doing `unicode(text, 'iso-8859-1')`.
        """
        if not isinstance(text, str):
            if isinstance(text, Exception):
                # two possibilities for storing unicode strings in exception data:
                try:
                    # custom __str__ method on the exception (e.g. PermissionError)
                    return unicode(text)
                except UnicodeError:
                    # unicode arguments given to the exception (e.g. parse_date)
                    return ' '.join([self.to_unicode(arg) for arg in text.args])
            return unicode(text)
        if charset:
            return unicode(text, charset, 'replace')
        else:
            try:
                return unicode(text, 'utf-8')
            except UnicodeError:
                return unicode(text, locale.getpreferredencoding(), 'replace')


